/*-------------------------------------------------------------------------
 * unicode_norm.c
 *		Normalize a Unicode string
 *
 * This implements Unicode normalization, per the documentation at
 * https://www.unicode.org/reports/tr15/.
 *
 * Portions Copyright (c) 2017-2020, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode_norm.c
 *
 *-------------------------------------------------------------------------
 */
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#include "common/unicode_norm.h"
#include "common/unicode_norm_table.h"
#ifndef FRONTEND
#include "common/unicode_normprops_table.h"
#endif

#ifndef FRONTEND
#define ALLOC(size) palloc(size)
#define FREE(size) pfree(size)
#else
#define ALLOC(size) malloc(size)
#define FREE(size) free(size)
#endif

/* Constants for calculations with Hangul characters */
#define SBASE		0xAC00		/* U+AC00 */
#define LBASE		0x1100		/* U+1100 */
#define VBASE		0x1161		/* U+1161 */
#define TBASE		0x11A7		/* U+11A7 */
#define LCOUNT		19
#define VCOUNT		21
#define TCOUNT		28
#define NCOUNT		VCOUNT * TCOUNT
#define SCOUNT		LCOUNT * NCOUNT

/* comparison routine for bsearch() of decomposition lookup table. */
static int
conv_compare(const void *p1, const void *p2)
{
    uint32		v1,
            v2;

    v1 = *(const uint32 *) p1;
    v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
    return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
}

/*
 * Get the entry corresponding to code in the decomposition lookup table.
 */
static pg_unicode_decomposition *
get_code_entry(pg_wchar code)
{
    return bsearch(&(code),
                   UnicodeDecompMain,
                   lengthof(UnicodeDecompMain),
                   sizeof(pg_unicode_decomposition),
                   conv_compare);
}

/*
 * Given a decomposition entry looked up earlier, get the decomposed
 * characters.
 *
 * Note: the returned pointer can point to statically allocated buffer, and
 * is only valid until next call to this function!
 */
static const pg_wchar *
get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
{
    static pg_wchar x;

    if (DECOMPOSITION_IS_INLINE(entry))
    {
        Assert(DECOMPOSITION_SIZE(entry) == 1);
        x = (pg_wchar) entry->dec_index;
        *dec_size = 1;
        return &x;
    }
    else
    {
        *dec_size = DECOMPOSITION_SIZE(entry);
        return &UnicodeDecomp_codepoints[entry->dec_index];
    }
}

/*
 * Calculate how many characters a given character will decompose to.
 *
 * This needs to recurse, if the character decomposes into characters that
 * are, in turn, decomposable.
 */
static int
get_decomposed_size(pg_wchar code, bool compat)
{
    pg_unicode_decomposition *entry;
    int			size = 0;
    int			i;
    const uint32 *decomp;
    int			dec_size;

    /*
     * Fast path for Hangul characters not stored in tables to save memory as
     * decomposition is algorithmic. See
     * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
     * on the matter.
     */
    if (code >= SBASE && code < SBASE + SCOUNT)
    {
        uint32		tindex,
                sindex;

        sindex = code - SBASE;
        tindex = sindex % TCOUNT;

        if (tindex != 0)
            return 3;
        return 2;
    }

    entry = get_code_entry(code);

    /*
     * Just count current code if no other decompositions.  A NULL entry is
     * equivalent to a character with class 0 and no decompositions.
     */
    if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
        (!compat && DECOMPOSITION_IS_COMPAT(entry)))
        return 1;

    /*
     * If this entry has other decomposition codes look at them as well. First
     * get its decomposition in the list of tables available.
     */
    decomp = get_code_decomposition(entry, &dec_size);
    for (i = 0; i < dec_size; i++)
    {
        uint32		lcode = decomp[i];

        size += get_decomposed_size(lcode, compat);
    }

    return size;
}

/*
 * Recompose a set of characters. For hangul characters, the calculation
 * is algorithmic. For others, an inverse lookup at the decomposition
 * table is necessary. Returns true if a recomposition can be done, and
 * false otherwise.
 */
static bool
recompose_code(uint32 start, uint32 code, uint32 *result)
{
    /*
     * Handle Hangul characters algorithmically, per the Unicode spec.
     *
     * Check if two current characters are L and V.
     */
    if (start >= LBASE && start < LBASE + LCOUNT &&
        code >= VBASE && code < VBASE + VCOUNT)
    {
        /* make syllable of form LV */
        uint32		lindex = start - LBASE;
        uint32		vindex = code - VBASE;

        *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
        return true;
    }
        /* Check if two current characters are LV and T */
    else if (start >= SBASE && start < (SBASE + SCOUNT) &&
             ((start - SBASE) % TCOUNT) == 0 &&
             code >= TBASE && code < (TBASE + TCOUNT))
    {
        /* make syllable of form LVT */
        uint32		tindex = code - TBASE;

        *result = start + tindex;
        return true;
    }
    else
    {
        int			i;

        /*
         * Do an inverse lookup of the decomposition tables to see if anything
         * matches. The comparison just needs to be a perfect match on the
         * sub-table of size two, because the start character has already been
         * recomposed partially.
         */
        for (i = 0; i < lengthof(UnicodeDecompMain); i++)
        {
            const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];

            if (DECOMPOSITION_SIZE(entry) != 2)
                continue;

            if (DECOMPOSITION_NO_COMPOSE(entry))
                continue;

            if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
                code == UnicodeDecomp_codepoints[entry->dec_index + 1])
            {
                *result = entry->codepoint;
                return true;
            }
        }
    }

    return false;
}

/*
 * Decompose the given code into the array given by caller. The
 * decomposition begins at the position given by caller, saving one
 * lookup on the decomposition table. The current position needs to be
 * updated here to let the caller know from where to continue filling
 * in the array result.
 */
static void
decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
{
    pg_unicode_decomposition *entry;
    int			i;
    const uint32 *decomp;
    int			dec_size;

    /*
     * Fast path for Hangul characters not stored in tables to save memory as
     * decomposition is algorithmic. See
     * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
     * on the matter.
     */
    if (code >= SBASE && code < SBASE + SCOUNT)
    {
        uint32		l,
                v,
                tindex,
                sindex;
        pg_wchar   *res = *result;

        sindex = code - SBASE;
        l = LBASE + sindex / (VCOUNT * TCOUNT);
        v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
        tindex = sindex % TCOUNT;

        res[*current] = l;
        (*current)++;
        res[*current] = v;
        (*current)++;

        if (tindex != 0)
        {
            res[*current] = TBASE + tindex;
            (*current)++;
        }

        return;
    }

    entry = get_code_entry(code);

    /*
     * Just fill in with the current decomposition if there are no
     * decomposition codes to recurse to.  A NULL entry is equivalent to a
     * character with class 0 and no decompositions, so just leave also in
     * this case.
     */
    if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
        (!compat && DECOMPOSITION_IS_COMPAT(entry)))
    {
        pg_wchar   *res = *result;

        res[*current] = code;
        (*current)++;
        return;
    }

    /*
     * If this entry has other decomposition codes look at them as well.
     */
    decomp = get_code_decomposition(entry, &dec_size);
    for (i = 0; i < dec_size; i++)
    {
        pg_wchar	lcode = (pg_wchar) decomp[i];

        /* Leave if no more decompositions */
        decompose_code(lcode, compat, result, current);
    }
}

/*
 * unicode_normalize - Normalize a Unicode string to the specified form.
 *
 * The input is a 0-terminated array of codepoints.
 *
 * In frontend, returns a 0-terminated array of codepoints, allocated with
 * malloc. Or NULL if we run out of memory. In backend, the returned
 * string is palloc'd instead, and OOM is reported with ereport().
 */
pg_wchar *
unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
{
    bool		compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
    bool		recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
    pg_wchar   *decomp_chars;
    pg_wchar   *recomp_chars;
    int			decomp_size,
            current_size;
    int			count;
    const pg_wchar *p;

    /* variables for recomposition */
    int			last_class;
    int			starter_pos;
    int			target_pos;
    uint32		starter_ch;

    /* First, do character decomposition */

    /*
     * Calculate how many characters long the decomposed version will be.
     */
    decomp_size = 0;
    for (p = input; *p; p++)
        decomp_size += get_decomposed_size(*p, compat);

    decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
    if (decomp_chars == NULL)
        return NULL;

    /*
     * Now fill in each entry recursively. This needs a second pass on the
     * decomposition table.
     */
    current_size = 0;
    for (p = input; *p; p++)
        decompose_code(*p, compat, &decomp_chars, &current_size);
    decomp_chars[decomp_size] = '\0';
    Assert(decomp_size == current_size);

    /*
     * Now apply canonical ordering.
     */
    for (count = 1; count < decomp_size; count++)
    {
        pg_wchar	prev = decomp_chars[count - 1];
        pg_wchar	next = decomp_chars[count];
        pg_wchar	tmp;
        pg_unicode_decomposition *prevEntry = get_code_entry(prev);
        pg_unicode_decomposition *nextEntry = get_code_entry(next);

        /*
         * If no entries are found, the character used is either an Hangul
         * character or a character with a class of 0 and no decompositions,
         * so move to next result.
         */
        if (prevEntry == NULL || nextEntry == NULL)
            continue;

        /*
         * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
         * annex 4, a sequence of two adjacent characters in a string is an
         * exchangeable pair if the combining class (from the Unicode
         * Character Database) for the first character is greater than the
         * combining class for the second, and the second is not a starter.  A
         * character is a starter if its combining class is 0.
         */
        if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
            continue;

        if (prevEntry->comb_class <= nextEntry->comb_class)
            continue;

        /* exchange can happen */
        tmp = decomp_chars[count - 1];
        decomp_chars[count - 1] = decomp_chars[count];
        decomp_chars[count] = tmp;

        /* backtrack to check again */
        if (count > 1)
            count -= 2;
    }

    if (!recompose)
        return decomp_chars;

    /*
     * The last phase of NFC and NFKC is the recomposition of the reordered
     * Unicode string using combining classes. The recomposed string cannot be
     * longer than the decomposed one, so make the allocation of the output
     * string based on that assumption.
     */
    recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
    if (!recomp_chars)
    {
        FREE(decomp_chars);
        return NULL;
    }

    last_class = -1;			/* this eliminates a special check */
    starter_pos = 0;
    target_pos = 1;
    starter_ch = recomp_chars[0] = decomp_chars[0];

    for (count = 1; count < decomp_size; count++)
    {
        pg_wchar	ch = decomp_chars[count];
        pg_unicode_decomposition *ch_entry = get_code_entry(ch);
        int			ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
        pg_wchar	composite;

        if (last_class < ch_class &&
            recompose_code(starter_ch, ch, &composite))
        {
            recomp_chars[starter_pos] = composite;
            starter_ch = composite;
        }
        else if (ch_class == 0)
        {
            starter_pos = target_pos;
            starter_ch = ch;
            last_class = -1;
            recomp_chars[target_pos++] = ch;
        }
        else
        {
            last_class = ch_class;
            recomp_chars[target_pos++] = ch;
        }
    }
    recomp_chars[target_pos] = (pg_wchar) '\0';

    FREE(decomp_chars);

    return recomp_chars;
}

/*
 * Normalization "quick check" algorithm; see
 * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
 */

/* We only need this in the backend. */
#ifndef FRONTEND

static uint8
get_canonical_class(pg_wchar ch)
{
    pg_unicode_decomposition *entry = get_code_entry(ch);

    if (!entry)
        return 0;
    else
        return entry->comb_class;
}

static const pg_unicode_normprops *
qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
{
    int			h;
    uint32		hashkey;

    /*
     * Compute the hash function. The hash key is the codepoint with the bytes
     * in network order.
     */
    hashkey = htonl(ch);
    h = norminfo->hash(&hashkey);

    /* An out-of-range result implies no match */
    if (h < 0 || h >= norminfo->num_normprops)
        return NULL;

    /*
     * Since it's a perfect hash, we need only match to the specific codepoint
     * it identifies.
     */
    if (ch != norminfo->normprops[h].codepoint)
        return NULL;

    /* Success! */
    return &norminfo->normprops[h];
}

/*
 * Look up the normalization quick check character property
 */
static UnicodeNormalizationQC
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
{
    const pg_unicode_normprops *found = NULL;

    switch (form)
    {
        case UNICODE_NFC:
            found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
            break;
        case UNICODE_NFKC:
            found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
            break;
        default:
            Assert(false);
            break;
    }

    if (found)
        return found->quickcheck;
    else
        return UNICODE_NORM_QC_YES;
}

UnicodeNormalizationQC
unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
{
    uint8		lastCanonicalClass = 0;
    UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;

    /*
     * For the "D" forms, we don't run the quickcheck.  We don't include the
     * lookup tables for those because they are huge, checking for these
     * particular forms is less common, and running the slow path is faster
     * for the "D" forms than the "C" forms because you don't need to
     * recompose, which is slow.
     */
    if (form == UNICODE_NFD || form == UNICODE_NFKD)
        return UNICODE_NORM_QC_MAYBE;

    for (const pg_wchar *p = input; *p; p++)
    {
        pg_wchar	ch = *p;
        uint8		canonicalClass;
        UnicodeNormalizationQC check;

        canonicalClass = get_canonical_class(ch);
        if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
            return UNICODE_NORM_QC_NO;

        check = qc_is_allowed(form, ch);
        if (check == UNICODE_NORM_QC_NO)
            return UNICODE_NORM_QC_NO;
        else if (check == UNICODE_NORM_QC_MAYBE)
            result = UNICODE_NORM_QC_MAYBE;

        lastCanonicalClass = canonicalClass;
    }
    return result;
}

#endif							/* !FRONTEND */